"""
1, 定位到2022必看影片
2，从2022必看影片中提取到页面的链接地址
3，请求连接地址 拿到我们想要的下载地址
"""
import csv

import requests
import re

# Home page of the site; also used as the prefix when building detail-page URLs.
url = 'https://dytt89.com/'

# Certificate verification is switched off for this request.
resp = requests.get(url, verify=False)
# Decode the response body with the character set the page is served in.
resp.encoding = 'gb2312'

# Captures the contents of the <ul> that follows the "2022必看热片" heading.
obj = re.compile(
    r'2022必看热片.*?<ul>(?P<ul>.*?)</ul>',
    re.DOTALL,
)
# Captures the href of each <li><a href='...'> entry inside that list.
obj2 = re.compile(
    r"<li><a href='(?P<href>.*?)'",
    re.DOTALL,
)
# Captures the film title and the link held in the first matching
# download table cell that appears after the title on a detail page.
obj3 = re.compile(
    r'◎片　　名　(?P<name>.*?)<br />.*?'
    r'<td style="WORD-WRAP: break-word" bgcolor="#fdfddf">'
    r'<a href="(?P<download>.*?)">',
    re.DOTALL,
)

# Absolute URLs of every detail page linked from the matched section(s).
child_href_list = []
for section in obj.finditer(resp.text):
    # Each captured href has its leading/trailing slashes stripped and is
    # appended to the site root (which already ends with "/").
    child_href_list.extend(
        url + link.group('href').strip('/')
        for link in obj2.finditer(section.group('ul'))
    )

# Visit every detail page and append its title and download link to the CSV.
# The file is opened once for the whole run and closed by the `with` block even
# if a request fails part-way through. newline='' is what the csv module
# expects, so no blank rows appear between records on Windows.
with open('电影天堂.csv', 'a', encoding='utf-8', newline='') as f:
    csvwriter = csv.writer(f)
    for page in child_href_list:
        child_resp = requests.get(page, verify=False)
        child_resp.encoding = 'gb2312'  # same character set as the home page
        res3 = obj3.search(child_resp.text)
        if res3 is None:
            # The page does not contain the expected title/download markup;
            # skip it instead of crashing on None.groupdict().
            print('未匹配到下载信息，已跳过:', page)
            continue
        # groupdict() preserves group order: name first, then download link.
        csvwriter.writerow(res3.groupdict().values())
print('爬取完毕')
